Importo librerias¶

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)

Funciones¶

In [2]:
def dame_variables_categoricas(dataset=None):
    '''
    ----------------------------------------------------------------------------------------------------------
    Función dame_variables_categoricas:
    ----------------------------------------------------------------------------------------------------------
        -Descripción: Función que recibe un dataset y devuelve una lista con los nombres de las 
        variables categóricas a pesar de ello no servirá en otros ejercicios ya que hay muchas númericas que son
        1 o 0 y no servirá la función para otros trabajos
        -Inputs:  
            -- dataset: Pandas dataframe que contiene los datos
        -Return:
            -- lista_variables_categoricas: lista con los nombres de las variables categóricas del
            dataset de entrada con menos de 100 valores diferentes
            -- 1: la ejecución es incorrecta
        
    '''
    if dataset is None:
        print(u'\nFaltan argumentos por pasar a la función')
        return 1
    lista_variables_categoricas = []
    other = []
    for i in dataset.columns:
        if (dataset[i].dtype!=float) & (dataset[i].dtype!=int):
            unicos = int(len(np.unique(dataset[i].dropna(axis=0, how='all'))))
            if i in ["fraud_bool", "payment_type", "employment_status", "housing_status", "source", "device_os"]: #selecciono de manera manual las categoricas
                #ya que hay muchas numericas que son 1 o 0 por tanto no serviria esta funcion para otro trabajo
                lista_variables_categoricas.append(i)
            else:
                other.append(i)

    return lista_variables_categoricas, other

Leo el dataset¶

In [3]:
df_base = pd.read_csv("../data/Base.csv")
df_base
Out[3]:
fraud_bool income name_email_similarity prev_address_months_count current_address_months_count customer_age days_since_request intended_balcon_amount payment_type zip_count_4w velocity_6h velocity_24h velocity_4w bank_branch_count_8w date_of_birth_distinct_emails_4w employment_status credit_risk_score email_is_free housing_status phone_home_valid phone_mobile_valid bank_months_count has_other_cards proposed_credit_limit foreign_request source session_length_in_minutes device_os keep_alive_session device_distinct_emails_8w device_fraud_count month
0 1 0.9 0.166828 -1 88 50 0.020925 -1.331345 AA 769 10650.765523 3134.319630 3863.647740 1 6 CA 185 0 BA 1 0 24 0 500.0 0 INTERNET 3.888115 windows 0 1 0 7
1 1 0.9 0.296286 -1 144 50 0.005418 -0.816224 AB 366 534.047319 2670.918292 3124.298166 718 3 CA 259 1 BA 0 0 15 0 1500.0 0 INTERNET 31.798819 windows 0 1 0 7
2 1 0.9 0.044985 -1 132 40 3.108549 -0.755728 AC 870 4048.534263 2893.621498 3159.590679 1 14 CB 177 1 BA 0 1 -1 0 200.0 0 INTERNET 4.728705 other 0 1 0 7
3 1 0.9 0.159511 -1 22 50 0.019079 -1.205124 AB 810 3457.064063 4054.908412 3022.261812 1921 6 CA 110 1 BA 0 1 31 1 200.0 0 INTERNET 2.047904 linux 0 1 0 7
4 1 0.9 0.596414 -1 218 50 0.004441 -0.773276 AB 890 5020.341679 2728.237159 3087.670952 1990 2 CA 295 1 BA 1 0 31 0 1500.0 0 INTERNET 3.775225 macintosh 1 1 0 7
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
999995 0 0.6 0.192631 -1 104 40 0.030592 -1.044454 AB 804 7905.711839 8341.468557 4972.635997 1 8 CA 75 1 BC 1 1 25 0 200.0 0 INTERNET 8.511502 linux 1 1 0 4
999996 0 0.8 0.322989 148 9 50 1.628119 -1.409803 AC 3306 5391.470463 4955.170808 5022.728108 0 2 CC 154 1 BC 1 1 -1 0 200.0 0 INTERNET 8.967865 windows 0 1 0 4
999997 0 0.8 0.879403 -1 30 20 0.018563 34.692760 AA 1522 8063.102636 5670.654316 4377.196321 2023 6 CF 64 0 BC 0 1 11 0 200.0 0 INTERNET 8.195531 other 0 1 0 4
999998 0 0.9 0.762112 -1 189 20 0.015352 94.661055 AA 1418 8092.641762 3982.582204 4394.803296 1678 6 CA 163 0 BA 1 0 28 0 500.0 0 INTERNET 4.336064 windows 1 1 0 4
999999 0 0.2 0.697452 -1 321 20 2.655916 9.908499 AA 951 6169.630036 3695.308261 4352.334543 2 12 CA 36 1 BE 0 1 15 0 200.0 0 INTERNET 6.717022 linux 0 1 0 4

1000000 rows × 32 columns

Transformar los -1¶

In [4]:
df_base = df_base.copy()
for i in list(df_base.columns):
    if i in ["prev_address_months_count", "current_address_months_count", "bank_months_count",
            "session_length_in_minutes", "device_distinct_emails"]:
        df_base[i] = np.where(df_base[i] == -1, np.nan, df_base[i])
        
df_base["intended_balcon_amount"] = np.where(df_base["intended_balcon_amount"] < 0, 
                                              np.nan, df_base["intended_balcon_amount"])

Este código realiza una copia del DataFrame original y luego modifica ciertas columnas específicas, reemplazando los valores -1 con NaN en algunas columnas y los valores negativos con NaN en la columna "intended_balcon_amount". De esta manera podemos trabajar de manera mas optima en el EDA, sabiendo con precision los valores nulos.

Analisis generales de la tabla¶

Dimensión

In [5]:
print(df_base.shape, df_base.drop_duplicates().shape)
(1000000, 32) (1000000, 32)

Tipos de datos

In [6]:
df_base.dtypes.to_dict()
Out[6]:
{'fraud_bool': dtype('int64'),
 'income': dtype('float64'),
 'name_email_similarity': dtype('float64'),
 'prev_address_months_count': dtype('float64'),
 'current_address_months_count': dtype('float64'),
 'customer_age': dtype('int64'),
 'days_since_request': dtype('float64'),
 'intended_balcon_amount': dtype('float64'),
 'payment_type': dtype('O'),
 'zip_count_4w': dtype('int64'),
 'velocity_6h': dtype('float64'),
 'velocity_24h': dtype('float64'),
 'velocity_4w': dtype('float64'),
 'bank_branch_count_8w': dtype('int64'),
 'date_of_birth_distinct_emails_4w': dtype('int64'),
 'employment_status': dtype('O'),
 'credit_risk_score': dtype('int64'),
 'email_is_free': dtype('int64'),
 'housing_status': dtype('O'),
 'phone_home_valid': dtype('int64'),
 'phone_mobile_valid': dtype('int64'),
 'bank_months_count': dtype('float64'),
 'has_other_cards': dtype('int64'),
 'proposed_credit_limit': dtype('float64'),
 'foreign_request': dtype('int64'),
 'source': dtype('O'),
 'session_length_in_minutes': dtype('float64'),
 'device_os': dtype('O'),
 'keep_alive_session': dtype('int64'),
 'device_distinct_emails_8w': dtype('int64'),
 'device_fraud_count': dtype('int64'),
 'month': dtype('int64')}

Exploración de la variable objetivo y tratamiento¶

In [7]:
df_base_fraud_status = df_base['fraud_bool'] \
    .value_counts(normalize=True) \
    .mul(100).rename('percent').reset_index()

df_base_fraud_status_conteo = df_base['fraud_bool'].value_counts().reset_index()
df_base_fraud_status_pc = pd.merge(df_base_fraud_status, df_base_fraud_status_conteo, on=['fraud_bool'], how='inner')
df_base_fraud_status_pc
Out[7]:
fraud_bool percent count
0 0 98.8971 988971
1 1 1.1029 11029

Esta linea de codigo proporciona una visión detallada de la distribución de la columna 'fraud_bool', incluyendo el porcentaje de cada valor y la frecuencia absoluta.

In [8]:
fig = px.histogram(df_base_fraud_status_pc, x="fraud_bool", y=['percent'])
fig.show()

Cada barra del histograma representa un valor único en 'fraud_bool', y la altura de la barra indica el porcentaje de ocurrencia de ese valor en el conjunto de datos.

Selección de threshold por filas y columnas para eliminar valores missing¶

In [9]:
df_series_null_columns = df_base.isnull().sum().sort_values(ascending=False)
df_series_null_rows = df_base.isnull().sum(axis=1).sort_values(ascending=False)
print(df_series_null_columns.shape, df_series_null_rows.shape)

df_null_columnas = pd.DataFrame(df_series_null_columns, columns=['nulos_columnas'])     
df_null_filas = pd.DataFrame(df_series_null_rows, columns=['nulos_filas'])  
df_null_filas['target'] = df_base['fraud_bool'].copy()
df_null_columnas['porcentaje_columnas'] = df_null_columnas['nulos_columnas']/df_base.shape[0]
df_null_filas['porcentaje_filas']= df_null_filas['nulos_filas']/df_base.shape[1]
# # df_null
(32,) (1000000,)

Este código realiza un análisis detallado de los valores nulos en el DataFrame df_base, proporcionando información sobre la cantidad y el porcentaje de valores nulos en cada columna y fila, así como su relación con la variable objetivo 'fraud_bool'.

In [10]:
df_base.shape
Out[10]:
(1000000, 32)
In [11]:
df_null_columnas
Out[11]:
nulos_columnas porcentaje_columnas
intended_balcon_amount 742523 0.742523
prev_address_months_count 712920 0.712920
bank_months_count 253635 0.253635
current_address_months_count 4254 0.004254
session_length_in_minutes 2015 0.002015
fraud_bool 0 0.000000
foreign_request 0 0.000000
phone_mobile_valid 0 0.000000
has_other_cards 0 0.000000
proposed_credit_limit 0 0.000000
device_os 0 0.000000
source 0 0.000000
housing_status 0 0.000000
keep_alive_session 0 0.000000
device_distinct_emails_8w 0 0.000000
device_fraud_count 0 0.000000
phone_home_valid 0 0.000000
credit_risk_score 0 0.000000
email_is_free 0 0.000000
income 0 0.000000
employment_status 0 0.000000
date_of_birth_distinct_emails_4w 0 0.000000
bank_branch_count_8w 0 0.000000
velocity_4w 0 0.000000
velocity_24h 0 0.000000
velocity_6h 0 0.000000
zip_count_4w 0 0.000000
payment_type 0 0.000000
days_since_request 0 0.000000
customer_age 0 0.000000
name_email_similarity 0 0.000000
month 0 0.000000
In [12]:
threshold=0.9
list_vars_not_null = list(df_null_columnas[df_null_columnas['porcentaje_columnas']<threshold].index)
df_base_filter_null = df_base.loc[:, list_vars_not_null]
df_base_filter_null.shape
Out[12]:
(1000000, 32)

Filtra las columnas del DataFrame original df_base y crea un nuevo DataFrame (df_base_filter_null) que retiene solo aquellas columnas que tienen un porcentaje de valores nulos inferior al umbral especificado (90%).

In [13]:
df_null_filas
Out[13]:
nulos_filas target porcentaje_filas
526010 4 0 0.125
771103 4 0 0.125
948524 4 0 0.125
979619 4 0 0.125
553607 4 0 0.125
... ... ... ...
200377 0 0 0.000
568248 0 0 0.000
916268 0 0 0.000
916266 0 0 0.000
500000 0 0 0.000

1000000 rows × 3 columns

Tipos: Variables categoricas y numericas¶

In [14]:
list_cat_vars, other = dame_variables_categoricas(dataset=df_base_filter_null)
df_base_filter_null[list_cat_vars] = df_base_filter_null[list_cat_vars].astype("category")
df_base_filter_null[list_cat_vars].head()
Out[14]:
fraud_bool device_os source housing_status employment_status payment_type
0 1 windows INTERNET BA CA AA
1 1 windows INTERNET BA CA AB
2 1 other INTERNET BA CB AC
3 1 linux INTERNET BA CA AB
4 1 macintosh INTERNET BA CA AB

Identifica las variables categóricas en df_base_filter_null, las convierte al tipo de dato "category" y luego imprime una vista previa de esas columnas.

In [15]:
list_cat_vars
Out[15]:
['fraud_bool',
 'device_os',
 'source',
 'housing_status',
 'employment_status',
 'payment_type']
In [16]:
df_base_filter_null['month'].value_counts()
Out[16]:
month
3    150936
2    136979
0    132440
4    127691
1    127620
5    119323
6    108168
7     96843
Name: count, dtype: int64
In [17]:
df_base_filter_null['bank_months_count'].value_counts()
Out[17]:
bank_months_count
1.0     194802
28.0     80082
15.0     59141
30.0     50777
31.0     46084
25.0     40450
10.0     37158
20.0     30850
21.0     29098
5.0      28001
2.0      25836
26.0     24779
11.0     23745
6.0      17678
29.0     11696
3.0       8580
19.0      8212
22.0      6741
9.0       4801
16.0      4563
27.0      4197
4.0       3834
24.0      1822
12.0      1121
7.0        931
14.0       493
18.0       491
23.0       320
32.0        46
8.0         30
17.0         4
13.0         2
Name: count, dtype: int64
In [18]:
df_base_filter_null[list_cat_vars].dtypes
Out[18]:
fraud_bool           category
device_os            category
source               category
housing_status       category
employment_status    category
payment_type         category
dtype: object
In [19]:
df_base_filter_null[other].head(10)
Out[19]:
foreign_request phone_mobile_valid has_other_cards keep_alive_session device_distinct_emails_8w device_fraud_count phone_home_valid credit_risk_score email_is_free date_of_birth_distinct_emails_4w bank_branch_count_8w zip_count_4w customer_age month
0 0 0 0 0 1 0 1 185 0 6 1 769 50 7
1 0 0 0 0 1 0 0 259 1 3 718 366 50 7
2 0 1 0 0 1 0 0 177 1 14 1 870 40 7
3 0 1 1 0 1 0 0 110 1 6 1921 810 50 7
4 0 0 0 1 1 0 1 295 1 2 1990 890 50 7
5 0 1 0 0 1 0 0 199 1 13 5 732 30 7
6 0 1 0 1 1 0 1 272 1 10 13 876 30 7
7 1 0 0 0 1 0 1 83 1 1 40 901 50 7
8 0 1 0 1 1 0 1 222 0 4 2134 933 40 7
9 0 1 0 0 1 0 0 118 1 2 8 1176 40 7
In [20]:
print(df_base_filter_null["current_address_months_count"].value_counts().count())
df_base_filter_null["current_address_months_count"]\
    .apply(lambda x: str(x).lower().strip()).value_counts(normalize=True)
422
Out[20]:
current_address_months_count
6.0      0.016376
7.0      0.016270
8.0      0.016089
5.0      0.015896
9.0      0.015688
4.0      0.015030
10.0     0.014739
11.0     0.013988
3.0      0.013986
12.0     0.013063
2.0      0.012741
13.0     0.011802
1.0      0.011297
14.0     0.010946
15.0     0.009699
0.0      0.009609
38.0     0.009021
40.0     0.008924
41.0     0.008835
37.0     0.008831
33.0     0.008811
34.0     0.008807
36.0     0.008795
39.0     0.008776
35.0     0.008767
42.0     0.008704
16.0     0.008615
43.0     0.008436
32.0     0.008424
31.0     0.008188
44.0     0.008165
45.0     0.008041
46.0     0.007994
30.0     0.007857
17.0     0.007634
29.0     0.007436
47.0     0.007429
49.0     0.007233
48.0     0.007198
28.0     0.007161
18.0     0.007074
27.0     0.006861
50.0     0.006818
26.0     0.006560
51.0     0.006408
52.0     0.006342
19.0     0.006337
25.0     0.006331
53.0     0.006180
20.0     0.006116
23.0     0.005948
24.0     0.005934
54.0     0.005930
21.0     0.005755
22.0     0.005747
55.0     0.005625
56.0     0.005599
57.0     0.005315
58.0     0.005218
59.0     0.004995
60.0     0.004867
61.0     0.004716
63.0     0.004604
62.0     0.004570
64.0     0.004515
65.0     0.004453
67.0     0.004391
69.0     0.004295
66.0     0.004269
nan      0.004254
68.0     0.004151
71.0     0.004145
70.0     0.004079
72.0     0.004052
73.0     0.004042
74.0     0.003920
75.0     0.003852
76.0     0.003698
77.0     0.003644
78.0     0.003588
79.0     0.003470
80.0     0.003385
81.0     0.003163
82.0     0.003146
83.0     0.003081
84.0     0.002967
85.0     0.002883
86.0     0.002824
188.0    0.002766
187.0    0.002740
87.0     0.002720
189.0    0.002700
88.0     0.002698
89.0     0.002671
186.0    0.002629
115.0    0.002593
191.0    0.002588
185.0    0.002572
184.0    0.002557
113.0    0.002548
190.0    0.002540
192.0    0.002527
90.0     0.002515
112.0    0.002499
91.0     0.002499
118.0    0.002488
111.0    0.002485
194.0    0.002483
122.0    0.002468
110.0    0.002455
107.0    0.002451
114.0    0.002449
193.0    0.002447
183.0    0.002441
117.0    0.002440
124.0    0.002437
116.0    0.002418
120.0    0.002416
101.0    0.002413
92.0     0.002409
105.0    0.002409
94.0     0.002401
108.0    0.002400
123.0    0.002388
106.0    0.002388
93.0     0.002383
99.0     0.002380
95.0     0.002378
109.0    0.002377
195.0    0.002373
119.0    0.002366
182.0    0.002365
98.0     0.002364
104.0    0.002349
125.0    0.002332
102.0    0.002330
128.0    0.002328
127.0    0.002317
96.0     0.002311
121.0    0.002308
103.0    0.002295
100.0    0.002259
126.0    0.002255
97.0     0.002254
129.0    0.002217
181.0    0.002185
130.0    0.002183
132.0    0.002178
135.0    0.002175
196.0    0.002143
136.0    0.002111
180.0    0.002098
131.0    0.002078
133.0    0.002062
134.0    0.002058
138.0    0.002054
137.0    0.002046
179.0    0.002038
197.0    0.002024
139.0    0.002008
142.0    0.001987
141.0    0.001976
178.0    0.001973
140.0    0.001952
143.0    0.001945
198.0    0.001928
144.0    0.001899
145.0    0.001890
147.0    0.001890
177.0    0.001873
148.0    0.001857
149.0    0.001850
146.0    0.001825
151.0    0.001824
153.0    0.001806
152.0    0.001800
155.0    0.001783
150.0    0.001774
156.0    0.001738
199.0    0.001712
154.0    0.001708
176.0    0.001703
157.0    0.001703
160.0    0.001688
175.0    0.001660
159.0    0.001649
158.0    0.001624
164.0    0.001593
161.0    0.001592
200.0    0.001560
162.0    0.001540
163.0    0.001532
174.0    0.001523
173.0    0.001498
165.0    0.001494
166.0    0.001471
170.0    0.001463
167.0    0.001456
171.0    0.001454
201.0    0.001446
172.0    0.001444
169.0    0.001411
168.0    0.001391
202.0    0.001306
203.0    0.001250
204.0    0.001093
238.0    0.001052
235.0    0.001015
236.0    0.001013
237.0    0.000988
233.0    0.000987
241.0    0.000985
242.0    0.000982
232.0    0.000978
231.0    0.000977
240.0    0.000965
230.0    0.000963
234.0    0.000963
247.0    0.000962
228.0    0.000961
229.0    0.000953
246.0    0.000953
239.0    0.000945
226.0    0.000944
227.0    0.000940
243.0    0.000938
370.0    0.000930
205.0    0.000920
244.0    0.000914
245.0    0.000903
367.0    0.000903
224.0    0.000897
368.0    0.000874
369.0    0.000871
251.0    0.000871
225.0    0.000871
222.0    0.000860
248.0    0.000853
250.0    0.000849
371.0    0.000841
365.0    0.000840
223.0    0.000838
254.0    0.000835
373.0    0.000826
253.0    0.000826
219.0    0.000823
252.0    0.000819
207.0    0.000815
249.0    0.000815
206.0    0.000812
221.0    0.000805
364.0    0.000805
366.0    0.000804
372.0    0.000795
220.0    0.000789
255.0    0.000785
256.0    0.000785
374.0    0.000773
217.0    0.000767
363.0    0.000758
208.0    0.000753
258.0    0.000738
257.0    0.000736
375.0    0.000734
262.0    0.000729
215.0    0.000724
218.0    0.000708
209.0    0.000705
210.0    0.000698
361.0    0.000698
362.0    0.000692
259.0    0.000687
376.0    0.000682
216.0    0.000676
377.0    0.000674
261.0    0.000673
260.0    0.000672
213.0    0.000667
211.0    0.000666
212.0    0.000665
264.0    0.000626
214.0    0.000617
263.0    0.000617
360.0    0.000613
378.0    0.000612
265.0    0.000610
267.0    0.000598
268.0    0.000589
266.0    0.000587
359.0    0.000569
269.0    0.000559
358.0    0.000535
270.0    0.000532
278.0    0.000518
271.0    0.000515
299.0    0.000515
272.0    0.000513
275.0    0.000510
379.0    0.000506
302.0    0.000500
284.0    0.000498
277.0    0.000494
274.0    0.000494
273.0    0.000493
295.0    0.000492
296.0    0.000492
294.0    0.000488
380.0    0.000487
288.0    0.000483
297.0    0.000479
291.0    0.000475
276.0    0.000474
289.0    0.000474
301.0    0.000474
310.0    0.000469
309.0    0.000465
304.0    0.000465
290.0    0.000465
298.0    0.000464
307.0    0.000464
286.0    0.000462
311.0    0.000456
303.0    0.000454
280.0    0.000453
356.0    0.000453
287.0    0.000452
357.0    0.000452
312.0    0.000446
282.0    0.000446
306.0    0.000444
281.0    0.000444
300.0    0.000442
279.0    0.000436
308.0    0.000435
292.0    0.000435
293.0    0.000429
305.0    0.000428
315.0    0.000424
313.0    0.000422
321.0    0.000422
381.0    0.000419
283.0    0.000417
285.0    0.000417
317.0    0.000406
316.0    0.000401
319.0    0.000401
355.0    0.000398
314.0    0.000392
325.0    0.000391
318.0    0.000391
322.0    0.000377
324.0    0.000376
320.0    0.000374
327.0    0.000363
323.0    0.000360
326.0    0.000351
382.0    0.000343
354.0    0.000332
328.0    0.000331
383.0    0.000331
332.0    0.000328
331.0    0.000324
329.0    0.000322
352.0    0.000308
333.0    0.000290
335.0    0.000287
338.0    0.000287
353.0    0.000287
330.0    0.000286
334.0    0.000284
336.0    0.000278
384.0    0.000277
337.0    0.000267
343.0    0.000254
339.0    0.000254
385.0    0.000248
340.0    0.000243
349.0    0.000242
351.0    0.000234
344.0    0.000229
350.0    0.000229
348.0    0.000226
346.0    0.000223
341.0    0.000221
345.0    0.000215
347.0    0.000209
342.0    0.000204
386.0    0.000160
387.0    0.000131
388.0    0.000115
389.0    0.000110
390.0    0.000064
391.0    0.000048
392.0    0.000038
393.0    0.000021
394.0    0.000017
395.0    0.000015
396.0    0.000010
398.0    0.000008
397.0    0.000008
399.0    0.000007
410.0    0.000007
402.0    0.000005
407.0    0.000005
401.0    0.000005
408.0    0.000004
406.0    0.000004
400.0    0.000004
404.0    0.000004
412.0    0.000004
405.0    0.000003
425.0    0.000002
418.0    0.000002
411.0    0.000002
416.0    0.000002
417.0    0.000001
424.0    0.000001
419.0    0.000001
403.0    0.000001
428.0    0.000001
413.0    0.000001
414.0    0.000001
409.0    0.000001
Name: proportion, dtype: float64

Normaliza la columna "current_address_months_count".

Preprocesamiento inicial de algunas variables¶

In [21]:
df_base_filter_null["income"][df_base_filter_null["income"] == 0.6000000000000001] = 0.6
df_base_filter_null["income"][df_base_filter_null["income"] == 0.7000000000000001]=0.7
C:\Users\jaime\AppData\Local\Temp\ipykernel_9648\1868583841.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

C:\Users\jaime\AppData\Local\Temp\ipykernel_9648\1868583841.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Redondeamos los valores.

In [22]:
for i in list(df_base_filter_null.columns):
    if df_base_filter_null[i].dtype == "int64":
        df_base_filter_null[i] = df_base_filter_null[i].astype("float64")

Convierte todas las columnas tipo int64 en float64 para poder trabajar correctamente con ellas en la parte de preprocesamiento de datos.

In [23]:
df_base_filter_null.to_csv("../data/pd_data_initial_preprocessing.csv")